{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jn4VjbRSEm8L"
   },
   "outputs": [],
   "source": [
    "# Install evidently from source only when it is not already importable.\n",
    "try:\n",
    "    import evidently\n",
    "except ImportError:\n",
    "    # Only a missing package should trigger the install; any other error\n",
    "    # (including KeyboardInterrupt) is allowed to propagate.\n",
    "    # yarn is installed first - presumably required by the source build (TODO confirm)\n",
    "    !npm install -g yarn\n",
    "    !pip install git+https://github.com/evidentlyai/evidently.git"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "from evidently.test_suite import TestSuite\n",
    "\n",
    "from sklearn.datasets import fetch_openml\n",
    "\n",
    "# Load the OpenML 'adult' dataset (version 2)\n",
    "data = fetch_openml(name='adult', version=2, as_frame='auto')\n",
    "df = data.frame\n",
    "\n",
    "# Rows with these education levels form the current data,\n",
    "# all remaining rows form the reference data\n",
    "current_education = ['Some-college', 'HS-grad', 'Bachelors']\n",
    "is_current = df.education.isin(current_education)\n",
    "\n",
    "# .copy() makes both frames independent of df, so the assignments below\n",
    "# modify current_data itself and do not raise SettingWithCopyWarning\n",
    "reference_data = df[~is_current].copy()\n",
    "current_data = df[is_current].copy()\n",
    "\n",
    "# Inject missing values into the first 2000 rows of the current data\n",
    "# (columns at positions 3, 4 and 12)\n",
    "current_data.iloc[:2000, 3:5] = np.nan\n",
    "current_data.iloc[:2000, 12] = np.nan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "zGBzUG6ThEU_"
   },
   "source": [
    "# Test Missing Values\n",
    "Missing values in a dataset can be encoded in different ways: None, NaN, an empty string, zero, etc.\n",
    "\n",
    "With these tests you can check the number or share of such values, using either the default list of missing values or your own."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tests with Default Missing Values List\n",
    "By default, the following values are treated as missing:\n",
    "- null-values from Pandas\n",
    "- empty strings\n",
    "- INF values from Numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from evidently.tests import (\n",
    "    TestNumberOfColumnsWithMissingValues,\n",
    "    TestNumberOfDifferentMissingValues,\n",
    "    TestNumberOfMissingValues,\n",
    "    TestNumberOfRowsWithMissingValues,\n",
    "    TestShareOfColumnsWithMissingValues,\n",
    "    TestShareOfMissingValues,\n",
    "    TestShareOfRowsWithMissingValues,\n",
    ")\n",
    "\n",
    "# Dataset-level missing values tests; no arguments are passed,\n",
    "# so every test runs with its default settings\n",
    "suite = TestSuite(\n",
    "    tests=[\n",
    "        TestNumberOfMissingValues(),\n",
    "        TestShareOfMissingValues(),\n",
    "        TestNumberOfColumnsWithMissingValues(),\n",
    "        TestShareOfColumnsWithMissingValues(),\n",
    "        TestNumberOfRowsWithMissingValues(),\n",
    "        TestShareOfRowsWithMissingValues(),\n",
    "        TestNumberOfDifferentMissingValues(),\n",
    "    ]\n",
    ")\n",
    "suite.run(reference_data=reference_data, current_data=current_data)\n",
    "\n",
    "# the bare expression renders the suite as the cell output\n",
    "suite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from evidently.tests import (\n",
    "    TestColumnNumberOfDifferentMissingValues,\n",
    "    TestColumnNumberOfMissingValues,\n",
    "    TestColumnShareOfMissingValues,\n",
    ")\n",
    "\n",
    "# Column-level missing values tests for the 'native-country' column,\n",
    "# run without a custom missing values list\n",
    "suite = TestSuite(\n",
    "    tests=[\n",
    "        TestColumnNumberOfMissingValues(column_name='native-country'),\n",
    "        TestColumnShareOfMissingValues(column_name='native-country'),\n",
    "        TestColumnNumberOfDifferentMissingValues(column_name='native-country'),\n",
    "    ]\n",
    ")\n",
    "suite.run(reference_data=reference_data, current_data=current_data)\n",
    "\n",
    "# the bare expression renders the suite as the cell output\n",
    "suite"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Missing Values Tests with Custom Missing Values List\n",
    "\n",
    "You can define your own list of missing values, such as 0, -9999, \"zero\" or \"no-value\", with the **missing_values** parameter.\n",
    "To add your values to the defaults, set the **replace** parameter to False.\n",
    "To use only your values, set the **replace** parameter to True.\n",
    "\n",
    "If you want to add Pandas missing values to your list, add None to it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "HoDiwl7TLI_U",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# set all ages less than 20 to 0 and greater than 30 to INF\n",
    "current_data.loc[current_data['age'] < 20, 'age'] = 0\n",
    "current_data.loc[current_data['age'] > 30, 'age'] = np.inf\n",
    "\n",
    "# run tests for age with three different missing values setups\n",
    "suite = TestSuite(tests=[\n",
    "    # no missing_values argument: the default missing values list is used\n",
    "    TestColumnNumberOfMissingValues(column_name='age'),\n",
    "    TestColumnShareOfMissingValues(column_name='age'),\n",
    "    TestColumnNumberOfDifferentMissingValues(column_name='age'),\n",
    "    # add 0 value to default missing values list\n",
    "    TestColumnNumberOfMissingValues(column_name='age', missing_values=[0], replace=False),\n",
    "    TestColumnShareOfMissingValues(column_name='age', missing_values=[0], replace=False),\n",
    "    TestColumnNumberOfDifferentMissingValues(column_name='age', missing_values=[0], replace=False),\n",
    "    # use 0 and missing values from Pandas as missing values list\n",
    "    # (all three tests get the same [0, None] list so their results are comparable)\n",
    "    TestColumnNumberOfMissingValues(column_name='age', missing_values=[0, None], replace=True),\n",
    "    TestColumnShareOfMissingValues(column_name='age', missing_values=[0, None], replace=True),\n",
    "    TestColumnNumberOfDifferentMissingValues(column_name='age', missing_values=[0, None], replace=True),\n",
    "])\n",
    "suite.run(current_data=current_data, reference_data=reference_data)\n",
    "suite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "example_tests.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
